{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Run Ad-Hoc Model Bias Analysis\n",
    "\n",
    "## Run Bias Analysis In The Notebook using `smclarify`\n",
    "https://github.com/aws/amazon-sagemaker-clarify"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use the %pip magic so the package is installed into the running kernel's environment.\n",
    "%pip install -q smclarify==0.1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import defaultdict\n",
    "from pprint import pprint\n",
    "from typing import Dict\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "import smclarify.bias.report  # binds the `smclarify` package name for fully qualified access\n",
    "from smclarify.bias.report import *\n",
    "from smclarify.util.dataset import Datasets, german_lending_readable_values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the reviews CSV from the local data-clarify folder and display its (rows, columns) shape.\n",
    "df = pd.read_csv('./data-clarify/amazon_reviews_us_giftcards_software_videogames_balanced.csv')\n",
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Pre-Training Bias Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Facet to analyse, plus the label column with a 5-star rating as the positive label value.\n",
    "facet_column = FacetColumn(name='product_category')\n",
    "label_column = LabelColumn(\n",
    "    name='star_rating',\n",
    "    data=df['star_rating'],\n",
    "    positive_label_values=[5],\n",
    ")\n",
    "group_variable = df['product_category']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run the pre-training stage of the bias report on the loaded reviews.\n",
    "pre_training_report = bias_report(\n",
    "    df,\n",
    "    facet_column=facet_column,\n",
    "    label_column=label_column,\n",
    "    stage_type=StageType.PRE_TRAINING,\n",
    "    group_variable=group_variable,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "pre_training_report"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Post-Training Bias Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Five hand-written reviews whose predicted rating equals the actual rating.\n",
    "data = dict(\n",
    "    star_rating=[1, 2, 3, 4, 5],\n",
    "    review_body=['Worst ever', 'Expected more', 'Its ok', 'I like it', 'I love it'],\n",
    "    product_category=['Gift Card', 'Gift Card', 'Gift Card', 'Toys', 'Toys'],\n",
    "    star_rating_predicted=[1, 2, 3, 4, 5],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the toy frame with an explicit column order; note this rebinds `df`, replacing the reviews loaded earlier.\n",
    "df = pd.DataFrame(data, columns=['star_rating', 'review_body', 'product_category', 'star_rating_predicted'])\n",
    "# Leave the frame as the last expression so the notebook renders it as a table instead of plain text.\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Convert data columns into the pandas `category` data type required by Clarify"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['star_rating'] = df['star_rating'].astype('category')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['star_rating_predicted'] = df['star_rating_predicted'].astype('category')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['product_category'] = df['product_category'].astype('category')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Configure Clarify"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Analyse the product category facet, with 'Gift Card' as its sensitive value.\n",
    "facet_column = FacetColumn(name='product_category', sensitive_values=['Gift Card'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Observed labels: a 5-star rating is the positive label value.\n",
    "label_column = LabelColumn(\n",
    "    name='star_rating',\n",
    "    data=df['star_rating'],\n",
    "    positive_label_values=[5],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# pd.Categorical expects list-like input, so wrap the scalar in a list.\n",
    "value = pd.Categorical([5])\n",
    "print(type(value))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Predicted labels, configured with the same positive label value as the observed labels.\n",
    "predicted_label_column = LabelColumn(\n",
    "    name='star_rating_predicted',\n",
    "    data=df['star_rating_predicted'],\n",
    "    positive_label_values=[5],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "group_variable = df['product_category']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Metric codes requested for the post-training stage of the report.\n",
    "post_training_metrics = ['DPPL', 'DI', 'DCA', 'DCR', 'RD', 'DAR', 'DRR', 'AD', 'CDDPL', 'TE']\n",
    "\n",
    "post_training_report = bias_report(\n",
    "    df,\n",
    "    facet_column=facet_column,\n",
    "    label_column=label_column,\n",
    "    stage_type=StageType.POST_TRAINING,\n",
    "    predicted_label_column=predicted_label_column,\n",
    "    metrics=post_training_metrics,\n",
    "    group_variable=group_variable,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Show Post-Training Bias Report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pretty-print the report structure; `pprint` is imported with the other imports at the top of the notebook.\n",
    "pprint(post_training_report)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Appendix: Troubleshooting `smclarify` API calls"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Call the library's private helper directly to inspect the positive-predicted index it derives\n",
    "# from the toy frame's label and predicted-label columns.\n",
    "pos_predicted_index = smclarify.bias.report._positive_predicted_index(\n",
    "    predicted_label_data=df['star_rating_predicted'],\n",
    "    label_data=df['star_rating'],\n",
    "    positive_label_values=[5]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(pos_predicted_index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parenthesise the inversion: `~x.all()` negates the scalar result of `.all()` instead of\n",
    "# testing whether every entry of the index is False.\n",
    "print((~pos_predicted_index).all())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compare the total column count with the count of numeric/bool columns; print both when they differ.\n",
    "total_column_count = len(df.columns)\n",
    "numeric_or_bool_count = len(df.select_dtypes([np.number, bool]).columns)\n",
    "if total_column_count != numeric_or_bool_count:\n",
    "    print(total_column_count)\n",
    "    print(numeric_or_bool_count)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _my_positive_predicted_index(\n",
    "    predicted_label_data: pd.Series, label_data: pd.Series, positive_label_values: List[Any]\n",
    ") -> pd.Series:\n",
    "    \"\"\"\n",
    "    Troubleshooting helper; appears to be a local copy of\n",
    "    `smclarify.bias.report._positive_predicted_index` with the continuous branch left out.\n",
    "\n",
    "    Creates the boolean positive predicted label index for the predicted label column.\n",
    "    `List`, `Any` and `common` are presumably provided by the star import of\n",
    "    `smclarify.bias.report` -- confirm if this cell fails with a NameError.\n",
    "\n",
    "    :param predicted_label_data: input data for predicted label column\n",
    "    :param label_data: input data for the label column\n",
    "    :param positive_label_values: list of positive label values\n",
    "    :return: positive predicted label index series (boolean)\n",
    "    :raises ValueError: if the label and predicted label columns differ in series datatype,\n",
    "        if the predicted labels cannot be cast to the label dtype, or if the index is all False\n",
    "    :raises NotImplementedError: if the labels are classified as continuous\n",
    "    :raises RuntimeError: if the predicted label data cannot be classified\n",
    "    \"\"\"\n",
    "    predicted_label_datatype = common.series_datatype(predicted_label_data, positive_label_values)\n",
    "    label_datatype = common.series_datatype(label_data, positive_label_values)\n",
    "    if predicted_label_datatype != label_datatype:\n",
    "        raise ValueError(\"Predicted Label Column series datatype is not the same as Label Column series\")\n",
    "    try:\n",
    "        predicted_label_data = predicted_label_data.astype(label_data.dtype)\n",
    "    except ValueError as e:\n",
    "        # Chain the original error so the failing cast stays visible in the traceback.\n",
    "        raise ValueError(\n",
    "            \"Labels and predicted labels cannot have different types (%s, %s).\"\n",
    "            % (label_data.dtype, predicted_label_data.dtype)\n",
    "        ) from e\n",
    "    if predicted_label_datatype == common.DataType.CONTINUOUS:\n",
    "        # The interval-based branch is not ported to this copy; fail loudly instead of\n",
    "        # falling through to an unbound `positive_predicted_index` below.\n",
    "        raise NotImplementedError(\"Continuous labels are not supported by this troubleshooting helper\")\n",
    "    elif predicted_label_datatype == common.DataType.CATEGORICAL and positive_label_values:\n",
    "        # Underscore-prefixed names are not brought in by `import *`, so reach the helper through\n",
    "        # the module (assumed to live next to `_positive_predicted_index` -- TODO confirm).\n",
    "        positive_predicted_index = smclarify.bias.report._categorical_data_idx(\n",
    "            predicted_label_data, positive_label_values\n",
    "        )\n",
    "    else:\n",
    "        raise RuntimeError(\"Predicted Label_column data is invalid or can't be classified\")\n",
    "    # check if positive index boolean series has all False values\n",
    "    if (~positive_predicted_index).all():\n",
    "        raise ValueError(\n",
    "            \"No Label values are present in the predicted Label Column,\"\n",
    "            \"Positive Predicted Index Series contains all False values\"\n",
    "        )\n",
    "    return positive_predicted_index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inputs for probing `common.series_datatype` in the next cell.\n",
    "predicted_label_data=df['star_rating_predicted']\n",
    "# NOTE(review): the positive label is the string '5' here, while the other cells pass the integer 5 -- confirm this is intended.\n",
    "positive_label_values=['5']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "predicted_label_datatype = common.series_datatype(predicted_label_data, positive_label_values)\n",
    "print(predicted_label_datatype)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): disabled call kept for reference. `_my_positive_predicted_index` returns a single\n",
    "# series, so the two-name unpacking below would have to be dropped before re-enabling it.\n",
    "# pos_predicted_index, inversed = _my_positive_predicted_index(\n",
    "#     predicted_label_data=df['star_rating_predicted'],\n",
    "#     label_data=df['star_rating'],\n",
    "#     positive_label_values=[5]\n",
    "# )"
   ]
  }
 ],
 "metadata": {
  "instance_type": "ml.t3.medium",
  "kernelspec": {
   "display_name": "Python 3 (Data Science)",
   "language": "python",
   "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-1:081325390199:image/datascience-1.0"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
